In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm

import ipyparallel as ipp

from time import time
from datetime import datetime

import motif as mf

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, RepeatedKFold

from scipy.stats import spearmanr
from scipy.stats import pearsonr
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
In [2]:
### set parameters for the motif analysis

PROTEIN_NAME='T7 primase'
PROT_CONC=5 # free protein concentration at binding reation; PBM typically 0.1 and RNACompete typically 0.002
BOTH_STRANDS=False # wheter both strands are present for binding; True if double-stranded DNA or RNA is used as probes

STAGES=mf.stage(protein=PROTEIN_NAME)
In [3]:
### read data

dfprobes_raw=pd.read_csv('./data/T7/chip_B_favor.csv', sep=';')
print('Columns of imported Data File: %s'%dfprobes_raw.columns)
Columns of imported Data File: Index(['0_nuc', '1_nuc', '2_nuc', '3_nuc', '4_nuc', '5_nuc', '6_nuc', '7_nuc',
       '8_nuc', '9_nuc', '10_nuc', '11_nuc', '12_nuc', '13_nuc', '14_nuc',
       '15_nuc', '16_nuc', '17_nuc', '18_nuc', '19_nuc', '20_nuc', '21_nuc',
       '22_nuc', '23_nuc', '24_nuc', '25_nuc', '26_nuc', '27_nuc', '28_nuc',
       '29_nuc', '30_nuc', '31_nuc', '32_nuc', '33_nuc', '34_nuc', '35_nuc',
       'prim_only_score', 'prim', 'poly', 'seq'],
      dtype='object')
In [4]:
### select columns for probe sequence and signal

column_sequence='seq'
column_signal='prim_only_score'
#background_signal='mean_background_intensity'
background_signal=None                             #set to None if not needed

#basic preprocessing
dfprobes_raw[column_signal]=dfprobes_raw[column_signal].apply(lambda a: np.NaN if a==' ' else a)
dfprobes_raw[column_sequence]=dfprobes_raw[column_sequence].apply(lambda a: np.NaN if a=='nan' else a)
dfprobes_raw=dfprobes_raw.dropna()

#construct new dataframe with only necessary data
if type(background_signal)==type(None):
    dfprobes=pd.DataFrame({'seq':dfprobes_raw[column_sequence].astype(str),
                           'signal binding': dfprobes_raw[column_signal].astype(np.float32)}) #rebuild dataframe
else:
    dfprobes=pd.DataFrame({'seq':dfprobes_raw[column_sequence].astype(str),
                           'signal': dfprobes_raw[column_signal].astype(np.float32), 
                           'background':dfprobes_raw[background_signal].astype(np.float32)}) #rebuild dataframe
    dfprobes['signal binding']=dfprobes['signal']-dfprobes['background']
# display main properties of data set

dfprobes['signal binding']=dfprobes['signal binding']**2  # as raw data was squared

dfprobes['signal binding'].plot(figsize=(15,5))
dfprobes.describe()

### check type of nucleic acid

dfprobes['seq']=dfprobes['seq'].apply(lambda seq: seq.upper().replace(" ", ""))  #upper and remove blanks
dfprobes['RNA']=dfprobes['seq'].apply(lambda seq: all(char in 'ACGU' for char in seq))
dfprobes['DNA']=dfprobes['seq'].apply(lambda seq: all(char in 'ACGT' for char in seq))
non_RNA_counts=len(dfprobes[dfprobes['RNA']==False])
non_DNA_counts=len(dfprobes[dfprobes['DNA']==False])

if non_RNA_counts<non_DNA_counts:
    NUC_TYPE='RNA'; print('I: RNA probes detected!')
else: NUC_TYPE='DNA'; print('I: DNA probes detected!')
    
if NUC_TYPE=='RNA' and non_RNA_counts!=0:
    print('E: The probe sequences appear to be RNA, however there are some non-RNA nucleotides in the sequences.')
    print('E: Please check the following sequnces %s'%dfprobes[dfprobes['RNA']==False])

if NUC_TYPE=='DNA' and non_DNA_counts!=0:
    print('E: The probe sequences appear to be RNA, however there are some non-RNA nucleotides in the sequences.')
    print('E: Please check the following sequnces %s'%dfprobes[dfprobes['DNA']==False])
I: DNA probes detected!
In [5]:
### option to add a constant sequence at the 3' end and 5' end
sequence_to_be_added_5='' 
sequence_to_be_added_3=''        # standard PBM arrays: CCTGTGTGAAATTGTTATCCGCTCT T7 array: GTCTTGA..
dfprobes['seq']=sequence_to_be_added_5.upper()+dfprobes['seq']+sequence_to_be_added_3.upper()
print(f"I: The nucleotide sequences {sequence_to_be_added_5.upper()} has been added to the 5' end all probe sequences.")
print(f"I: The nucleotide sequences {sequence_to_be_added_3.upper()} has been added to the 3' end all probe sequences.")
I: The nucleotide sequences  has been added to the 5' end all probe sequences.
I: The nucleotide sequences  has been added to the 3' end all probe sequences.
In [6]:
### egalize length
dfprobes['seq_length']=dfprobes['seq'].apply(len)    

if max(dfprobes['seq_length'])!=min(dfprobes['seq_length']):
    print('I: Probes length is not uniform, detected range: %i ..%i' %(min(dfprobes['seq_length']),max(dfprobes['seq_length'])))
    max_length=max(dfprobes['seq_length'])
    dfprobes['padded_sequence']=dfprobes['seq'].apply(lambda seq: seq+((max_length-len(seq))*'-'))
    print("I: Probe sequences have been padded at the 5' to the uniform length of %i nucleotides" %max_length)
else:
    print('I: Probe sequences have the uniform length of %i nucleotides' %(dfprobes['seq_length'].median()))
    dfprobes['padded_sequence']=dfprobes['seq']

print('I: Total datasets contains %i sequences.' % len(dfprobes))

# visualize composition of each position
df_nucleotides=mf.split_sequence_in_nucleotides(dfprobes['padded_sequence'])
dfcount=pd.DataFrame(index=['A', 'C', 'G', 'T', 'U', '-'])
for column in df_nucleotides:
    dfcount[column]=df_nucleotides[column].value_counts() 
dfcount=dfcount.fillna(0) #zeros for NaN
dfcount.transpose().plot(figsize=(15,5), kind='bar')
print('I: Visualisation of the base composition per position')
print('I: If positions are invariant they can be removed before sequence analysis.')
I: Probe sequences have the uniform length of 36 nucleotides
I: Total datasets contains 3149 sequences.
I: Visualisation of the base composition per position
I: If positions are invariant they can be removed before sequence analysis.
In [7]:
# You may remove invariant continuos positions by adjusting the slicing. 
# It is recommended to leave a few invariant positions to allow for binding events
# between the variable and constant part of the probes.

dfprobes['padded_sequence']=dfprobes['padded_sequence'].apply(lambda s:s[:43]) ### <==== do the slicing here

# visualize composition of each position
print('I: Visualisation of the base composition per position after slicing.')
df_nucleotides=mf.split_sequence_in_nucleotides(dfprobes['padded_sequence'])
dfcount=pd.DataFrame(index=['A', 'C', 'G', 'T', 'U', '-'])
for column in df_nucleotides:
    dfcount[column]=df_nucleotides[column].value_counts() 
dfcount=dfcount.fillna(0) #zeros for NaN
dfcount.transpose().plot(figsize=(15,5), kind='bar')
plt.show()

### Preparation for later classification
mean=dfprobes['signal binding'].mean()
std=dfprobes['signal binding'].std()
THRESHOLD=mean+2*std   
dfprobes['positive probe']=dfprobes['signal binding'].apply(lambda s: True if s>THRESHOLD else False)

print('I: The whole dataset has been used to set the threshold for a positive probe.')
print('T: The threshold is %f' %THRESHOLD)
print(f"I: {len(dfprobes[dfprobes['positive probe']])} probes of {len(dfprobes)} are above threshold.")
I: Visualisation of the base composition per position after slicing.
I: The whole dataset has been used to set the threshold for a positive probe.
T: The threshold is 0.663948
I: 196 probes of 3149 are above threshold.
In [8]:
### Shuffle and prepare dataset for training and testing

# shuffle 
dfprobes = shuffle(dfprobes)

# display histogramms of dataset
dfprobes['signal binding'].plot(kind='hist', bins=25).axvline(x=THRESHOLD, color='r', linestyle='-.', lw=0.5, label='threshold classification')

# complete data
X=mf.hotencode_sequence(dfprobes['padded_sequence'], nuc_type=NUC_TYPE)
y=np.array(dfprobes['signal binding'])
In [9]:
#### Perfrom GridCV Search for exploration of the motif length and other model parameters 
# primary goal: identify the minimum motif length which gives a good r-value
# optional: allow also for global optimization to verify whether the local optimization is good enough
# optional: vary G0 to find a good regression model
# optional: fitG0=True. This option adjust G0, so that maximal binding is around 1 (prevents saturation of probes by multiple binding events)


model_grid=mf.findmotif(protein_conc=PROT_CONC, both_strands=BOTH_STRANDS, fit_G0=True)

# prepare grid search over parameters
param_grid = {"motif_length": [1,2,3,4,5,6]}     # choose sensible range for length of motif and other parameters

# run grid search
grid_search = GridSearchCV(model_grid, param_grid=param_grid, verbose=2, 
                           cv=RepeatedKFold(n_splits=5, n_repeats=10),
                           n_jobs=-1,
                           return_train_score=True)

start = time()
grid_search.fit(X, y)
df_grid=pd.DataFrame(grid_search.cv_results_)

print("I: GridSearchCV took %.2f hours for %d candidate parameter settings."
    % ((time() - start)/3600, len(grid_search.cv_results_["params"])))
print('I: number of samples: %i' %len(X))

df_grid=pd.DataFrame(grid_search.cv_results_)

print('I: Plot of r vs motif length')
df_grid['r(train)']=df_grid['mean_train_score'].apply(np.sqrt)
df_grid['r(test)']=df_grid['mean_test_score'].apply(np.sqrt)
df_grid['std(train)']=df_grid['std_train_score'].apply(np.sqrt)
df_grid['std(test)']=df_grid['std_test_score'].apply(np.sqrt)


fig, ax=plt.subplots()
df_grid.plot(ax=ax,kind='scatter', x='param_motif_length', y='r(train)', yerr='std(train)', figsize=(5,3), c='blue').set_xticks(param_grid["motif_length"])
df_grid.plot(ax=ax,kind='scatter', x='param_motif_length', y='r(test)', yerr='std(test)', c='red')
ax.set_ylabel('r')
ax.legend(['train', 'test'])
plt.show()
df_grid[['param_motif_length','r(test)', 'r(train)']]
Fitting 50 folds for each of 6 candidates, totalling 300 fits
I: GridSearchCV took 12.73 hours for 6 candidate parameter settings.
I: number of samples: 3149
I: Plot of r vs motif length
Out[9]:
param_motif_length r(test) r(train)
0 1 0.177967 0.187764
1 2 0.663136 0.666127
2 3 0.676877 0.678723
3 4 0.669234 0.673278
4 5 0.697536 0.705088
5 6 0.696467 0.705967
In [27]:
%store df_grid
Stored 'df_grid' (DataFrame)
In [28]:
%store param_grid
Stored 'param_grid' (dict)
In [10]:
### run a number of identical optimizations with motif length found during grid search
### goal: find best motif through repetition, judge stabiltiy of optimization

CORE_MOTIF_LENGTH=4  # adjust core motif length if needed, motif length can be changed later

# prepare for ipyparallel
number_of_optimizations = 200
model_list = [mf.findmotif(motif_length=CORE_MOTIF_LENGTH, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS, fit_G0=True)] * number_of_optimizations
X_list = [X] * number_of_optimizations
y_list = [y] * number_of_optimizations


def single_job(model, X, y):
    model.fit(X, y)
    return {'model':model}

# run the optimizations on ipp.cluster
start = time()
with ipp.Cluster(log_level=40) as rc:
    rc[:].use_pickle()
    view = rc.load_balanced_view()
    asyncresult = view.map_async(single_job, model_list, X_list, y_list)
    asyncresult.wait_interactive()
    result = asyncresult.get()
print("I: Optimization took %.2f hours." % ((time() - start) / 3600))


  
# assemble results and analyze
df_repetitions=pd.DataFrame(result)
df_repetitions['r']=df_repetitions['model'].apply(lambda e: mf.linregress(e.predict(X),y).rvalue)
df_repetitions['G0']=df_repetitions['model'].apply(lambda e: e.finalG0_)
df_repetitions['max binding']=df_repetitions['model'].apply(lambda e: e.max_binding_fit)
df_repetitions['min binding']=df_repetitions['model'].apply(lambda e: e.min_binding_fit)
df_repetitions['ratio'] = df_repetitions['model'].apply(lambda e: e.ratio)
df_repetitions['energies']=df_repetitions['model'].apply(lambda e: e.energies_)
df_repetitions['information']=df_repetitions['model'].apply(lambda e: mf.energies2information(e.energies_))


# display results of the ensemble of optimizations
print('I: Results of the repeated motif finding, sorted according to the regression coefficient')
df_repetitions.sort_values('r', ascending=False, inplace=True)
mf.display_df(df_repetitions, nuc_type=NUC_TYPE)
  0%|          | 0/16 [00:00<?, ?engine/s]
single_job:   0%|          | 0/200 [00:00<?, ?tasks/s]
I: Optimization took 9.36 hours.
I: Results of the repeated motif finding, sorted according to the regression coefficient
model r G0 max binding min binding ratio energies information logo
56 suppressed 0.706996 -9177.028204 0.972033 0.133311 7.291495 -7053,.. 2.362101
142 suppressed 0.706961 -10863.349327 0.964528 0.124707 7.734337 -5334,.. 2.310951
157 suppressed 0.706862 -9727.766114 0.966156 0.135750 7.117146 -6577,.. 2.317627
148 suppressed 0.706826 -11675.698748 0.967140 0.134469 7.192312 -4909,.. 2.269945
163 suppressed 0.706807 -11486.910299 0.965590 0.136809 7.057952 -5785,.. 2.214978
131 suppressed 0.706774 -9728.752465 0.979635 0.121770 8.044950 -5290,.. 2.300738
106 suppressed 0.706773 -7915.539991 0.952564 0.124481 7.652290 -8470,.. 2.364914
147 suppressed 0.706685 -10933.515501 0.965363 0.132006 7.313014 -4346,.. 2.340699
70 suppressed 0.706631 -12214.012249 0.963979 0.139102 6.930042 -5148,.. 2.223107
89 suppressed 0.706580 -11248.116017 0.968648 0.136544 7.094042 -6221,.. 2.123406
170 suppressed 0.706539 -12052.588484 0.963785 0.144152 6.685888 -4561,.. 2.256433
113 suppressed 0.706530 -11913.812017 0.969498 0.126538 7.661721 -4487,.. 2.247607
52 suppressed 0.706515 -12926.542010 0.965676 0.140475 6.874351 -4724,.. 2.109999
2 suppressed 0.706428 -12557.874320 0.958903 0.147678 6.493219 -4832,.. 2.246795
193 suppressed 0.706419 -13273.206683 0.974611 0.142844 6.822906 -4308,.. 2.144290
67 suppressed 0.706398 -12923.995193 0.967148 0.137279 7.045115 -4610,.. 2.111763
115 suppressed 0.706326 -12983.544326 0.972911 0.146713 6.631377 -5042,.. 2.079642
59 suppressed 0.706294 -12176.505872 0.965304 0.148764 6.488806 -5943,.. 2.064881
0 suppressed 0.706252 -12343.309172 0.956334 0.143811 6.649934 -5709,.. 2.072961
117 suppressed 0.706111 -12177.537911 0.970942 0.134118 7.239444 -3974,.. 2.095936
18 suppressed 0.706074 -13749.828576 0.975109 0.156092 6.246997 -4223,.. 2.058359
127 suppressed 0.706036 -10854.233141 0.953125 0.141681 6.727261 -6960,.. 2.116820
79 suppressed 0.705990 -12794.544096 0.974744 0.141340 6.896470 -4728,.. 2.046540
27 suppressed 0.705884 -10317.128912 0.967302 0.150407 6.431250 -4349,.. 2.238211
77 suppressed 0.705877 -14307.909056 0.969338 0.152928 6.338521 -4033,.. 1.948135
5 suppressed 0.705833 -13465.399731 0.968937 0.154969 6.252474 -4878,.. 1.966427
28 suppressed 0.705824 -13257.450651 0.970967 0.151509 6.408643 -5022,.. 2.031888
12 suppressed 0.705811 -11814.471246 0.965943 0.154481 6.252820 -6645,.. 2.010246
190 suppressed 0.705788 -14142.449874 0.973330 0.155344 6.265625 -4195,.. 2.043055
130 suppressed 0.705769 -13728.165226 0.967874 0.140905 6.868995 -4015,.. 2.022657
105 suppressed 0.705719 -14379.041850 0.978379 0.154051 6.350986 -4013,.. 1.993526
111 suppressed 0.705703 -12947.902083 0.970576 0.149884 6.475512 -4823,.. 1.953821
135 suppressed 0.705640 -14039.619117 0.973976 0.150254 6.482181 -3802,.. 2.004695
43 suppressed 0.705622 -14487.299010 0.967699 0.160324 6.035891 -3927,.. 1.978947
83 suppressed 0.705541 -14577.425748 0.981237 0.163550 5.999600 -4077,.. 1.946875
178 suppressed 0.705485 -14061.491830 0.968489 0.159125 6.086355 -3858,.. 1.959808
32 suppressed 0.705459 -14266.668919 0.967770 0.170622 5.672020 -4150,.. 2.023246
26 suppressed 0.705413 -14701.042064 0.978571 0.169238 5.782209 -3986,.. 1.912349
54 suppressed 0.705339 -14959.846587 0.975378 0.162206 6.013202 -3678,.. 1.902153
45 suppressed 0.705324 -13361.090070 0.968532 0.159637 6.067098 -4836,.. 2.026302
114 suppressed 0.705315 -12670.666987 0.976832 0.158935 6.146115 -6041,.. 1.986389
24 suppressed 0.705314 -14054.154743 0.980018 0.155556 6.300092 -4528,.. 1.938419
14 suppressed 0.705282 -14088.876587 0.974698 0.152134 6.406837 -3837,.. 2.043591
21 suppressed 0.705188 -8305.931026 0.962993 0.144902 6.645806 -8188,.. 2.340227
48 suppressed 0.705135 -14723.661981 0.972560 0.172590 5.635099 -3644,.. 1.984678
36 suppressed 0.705020 -14820.934033 0.977018 0.168904 5.784454 -3864,.. 1.823897
120 suppressed 0.704992 -14451.163255 0.978946 0.175247 5.586075 -4413,.. 1.908539
140 suppressed 0.704955 -12370.971274 0.974654 0.156738 6.218381 -6621,.. 1.886788
124 suppressed 0.704954 -11465.242231 0.967891 0.169187 5.720832 -7332,.. 1.946722
139 suppressed 0.704909 -13699.534998 0.972513 0.166307 5.847711 -4736,.. 2.000360
100 suppressed 0.704803 -15229.762285 0.991523 0.166159 5.967320 -3816,.. 1.799674
187 suppressed 0.704578 -14778.630303 0.961413 0.175678 5.472591 -4233,.. 1.857307
181 suppressed 0.704422 -15758.529236 0.994799 0.192793 5.159941 -3386,.. 1.755134
3 suppressed 0.704033 -14230.229409 0.971167 0.173523 5.596750 -3634,.. 1.964398
7 suppressed 0.704010 -13187.195178 0.972048 0.172629 5.630850 -5467,.. 2.011791
25 suppressed 0.703962 -15155.136559 0.974104 0.186596 5.220385 -3974,.. 1.817961
60 suppressed 0.703665 -15048.014390 1.002399 0.193495 5.180495 -4302,.. 1.708500
46 suppressed 0.703537 -14556.597842 0.979293 0.175544 5.578615 -4895,.. 1.762471
16 suppressed 0.703486 -15225.570281 0.971553 0.173898 5.586916 -3398,.. 1.814056
123 suppressed 0.703194 -16545.540324 0.981143 0.209328 4.687109 -3038,.. 1.554067
72 suppressed 0.703165 -15850.112081 0.973095 0.203243 4.787836 -3353,.. 1.773192
75 suppressed 0.698884 -18233.733094 0.996786 0.311778 3.197099 -2342,.. 1.082989
38 suppressed 0.695414 -13623.049624 0.994722 0.202557 4.910824 -3831,.. 2.396506
138 suppressed 0.694687 -15089.902655 1.014728 0.220736 4.597027 -3338,.. 2.119809
23 suppressed 0.694529 -16078.835412 1.013021 0.181807 5.571966 3802,.. 1.433959
152 suppressed 0.694514 -15316.049402 0.999945 0.148745 6.722527 4993,.. 1.550912
11 suppressed 0.694501 -16062.485560 0.998670 0.169212 5.901884 3614,.. 1.456914
112 suppressed 0.694494 -13742.600462 0.997189 0.111924 8.909516 6957,.. 1.689719
175 suppressed 0.694472 -14164.875513 0.996417 0.138673 7.185389 5902,.. 1.575634
66 suppressed 0.694472 -15002.774532 1.008638 0.126429 7.977893 5820,.. 1.604546
168 suppressed 0.694467 -13724.735721 1.004490 0.133346 7.532940 6287,.. 1.566653
184 suppressed 0.694458 -15387.275370 0.994654 0.188431 5.278623 2995,.. 1.401132
133 suppressed 0.694448 -12414.691250 0.994884 0.109906 9.052176 9312,.. 1.711392
93 suppressed 0.694435 -14169.410972 1.002387 0.124961 8.021589 9148,.. 1.639763
62 suppressed 0.694433 -12264.565203 1.005852 0.122552 8.207558 9222,.. 1.686218
183 suppressed 0.694432 -14371.123342 0.994981 0.193133 5.151792 3347,.. 1.392146
53 suppressed 0.694412 -15834.883136 1.005492 0.156837 6.411059 4776,.. 1.484806
96 suppressed 0.694405 -15150.371077 1.000244 0.148052 6.756014 3849,.. 1.513579
68 suppressed 0.694395 -13458.785667 0.997705 0.120960 8.248227 8730,.. 1.687686
176 suppressed 0.694343 -14975.228364 1.005385 0.155619 6.460571 3759,.. 1.504388
74 suppressed 0.694342 -15150.985358 1.003605 0.131100 7.655257 7033,.. 1.593152
128 suppressed 0.694338 -16536.467718 1.000250 0.227072 4.404989 2682,.. 1.298022
185 suppressed 0.694315 -13738.270855 1.009902 0.164943 6.122722 4432,.. 1.465954
173 suppressed 0.694313 -15883.089098 1.010970 0.174569 5.791239 4027,.. 1.474913
49 suppressed 0.694281 -17244.301344 0.992480 0.222056 4.469505 2967,.. 1.288843
197 suppressed 0.694280 -13643.979833 1.000934 0.194265 5.152403 2254,.. 1.498778
87 suppressed 0.694267 -16629.203447 0.995603 0.191165 5.208070 2656,.. 1.417589
171 suppressed 0.694237 -14655.811006 1.003676 0.138184 7.263338 7563,.. 1.574202
134 suppressed 0.694172 -8769.493202 1.000198 0.090513 11.050361 14864,.. 1.768039
6 suppressed 0.694171 -15555.987352 0.994548 0.158038 6.293098 5609,.. 1.512444
107 suppressed 0.693943 -14159.521215 1.008739 0.152237 6.626090 3627,.. 1.475668
98 suppressed 0.693925 -16305.551122 1.003250 0.175467 5.717609 5049,.. 1.439092
19 suppressed 0.693917 -13874.488687 1.017081 0.181240 5.611794 4695,.. 1.447960
71 suppressed 0.693914 -16364.352404 0.996756 0.171728 5.804261 5454,.. 1.415694
137 suppressed 0.693901 -15117.836331 0.989319 0.156571 6.318654 2208,.. 1.636275
92 suppressed 0.693895 -16730.019051 1.005339 0.200997 5.001760 3714,.. 1.329585
151 suppressed 0.693877 -16347.669237 1.001342 0.167465 5.979419 5545,.. 1.433163
177 suppressed 0.693819 -16232.167039 1.000425 0.232371 4.305291 1694,.. 1.396094
76 suppressed 0.693783 -14796.647482 0.990767 0.143743 6.892647 1966,.. 1.729334
169 suppressed 0.693747 -14755.260130 0.996178 0.135208 7.367745 3340,.. 1.640316
41 suppressed 0.693740 -17832.648444 1.000571 0.273132 3.663323 2284,.. 1.174987
29 suppressed 0.693730 -16794.469280 1.006752 0.190572 5.282797 4328,.. 1.349449
174 suppressed 0.693650 -16425.707625 1.000743 0.190659 5.248862 2101,.. 1.450030
73 suppressed 0.693591 -15905.953102 0.997153 0.195337 5.104782 1864,.. 1.526860
22 suppressed 0.693492 -16180.359940 0.999066 0.145244 6.878535 4600,.. 1.491685
37 suppressed 0.693463 -12675.489582 1.004201 0.131172 7.655599 1798,.. 1.885736
51 suppressed 0.693445 -15151.229348 0.994113 0.132859 7.482468 6056,.. 1.614267
63 suppressed 0.693369 -15653.765872 0.999983 0.153297 6.523169 2486,.. 1.631502
146 suppressed 0.693074 -16487.993317 1.017384 0.179676 5.662324 6159,.. 1.396452
50 suppressed 0.693036 -14052.624639 0.970005 0.117852 8.230672 6736,.. 1.675559
20 suppressed 0.693028 -12364.355925 1.003365 0.115630 8.677385 3367,.. 1.765088
194 suppressed 0.692945 -15805.385445 0.989678 0.149223 6.632210 6822,.. 1.489779
80 suppressed 0.692882 -14273.457012 1.004135 0.139847 7.180234 10160,.. 1.578089
116 suppressed 0.692686 -18097.210380 1.016817 0.285228 3.564926 2972,.. 1.087331
132 suppressed 0.692673 -18432.020836 0.989300 0.318265 3.108419 1567,.. 1.016520
180 suppressed 0.692585 -16033.236868 1.001339 0.156398 6.402497 1619,.. 1.558806
15 suppressed 0.692562 -18077.831238 0.999079 0.290434 3.439956 1318,.. 1.106852
9 suppressed 0.692526 -14921.955589 1.000281 0.090554 11.046244 4794,.. 1.701320
167 suppressed 0.692142 -13294.262711 1.001934 0.249682 4.012837 927,.. 1.347249
65 suppressed 0.692015 -15158.321299 0.999915 0.268486 3.724273 2478,.. 1.166006
40 suppressed 0.691969 -14626.548881 0.998482 0.177429 5.627515 999,.. 1.559129
85 suppressed 0.691758 -12029.802566 0.987214 0.056552 17.456793 5942,.. 1.867132
145 suppressed 0.691695 -14977.745283 0.998701 0.092298 10.820350 4117,.. 1.700267
121 suppressed 0.691649 -16852.495659 1.002502 0.198143 5.059479 1279,.. 1.448922
109 suppressed 0.691624 11.105179 0.990409 0.092495 10.707753 -13237,.. 2.294190
42 suppressed 0.691413 -14119.200441 0.999907 0.162818 6.141246 1923,.. 1.683997
102 suppressed 0.691146 -15496.540162 1.004959 0.155285 6.471688 6389,.. 1.556367
95 suppressed 0.690974 -13939.120994 0.999838 0.264063 3.786364 -6590,.. 1.228277
88 suppressed 0.690820 -14844.309111 1.014893 0.167929 6.043600 2945,.. 1.416806
188 suppressed 0.690502 -12473.763646 0.994681 0.161716 6.150772 480,.. 1.511505
4 suppressed 0.690244 -15635.011527 1.001321 0.339025 2.953533 808,.. 1.074127
182 suppressed 0.689944 -14860.967741 1.017382 0.367334 2.769637 979,.. 1.026860
136 suppressed 0.689725 -15719.484869 1.009689 0.213316 4.733294 1237,.. 1.362489
108 suppressed 0.689267 -16868.286217 1.010455 0.398269 2.537115 1003,.. 0.951770
179 suppressed 0.688211 -14695.841635 1.002931 0.144639 6.934023 3931,.. 1.532106
78 suppressed 0.688161 -14766.394484 0.995935 0.285833 3.484327 269,.. 1.178970
58 suppressed 0.688110 -14520.763303 0.993134 0.221677 4.480086 -57,.. 1.317097
158 suppressed 0.687926 -12973.184141 1.006087 0.053497 18.806378 6828,.. 2.049839
69 suppressed 0.687829 -15418.965515 0.993655 0.194727 5.102813 67,.. 1.446613
97 suppressed 0.687648 -14316.538926 0.998669 0.093484 10.682791 334,.. 1.750742
101 suppressed 0.687579 -15265.916851 1.000543 0.210953 4.742970 54,.. 1.394806
144 suppressed 0.687376 -17053.219866 0.996680 0.255810 3.896177 267,.. 1.238990
199 suppressed 0.687220 -11473.991534 0.986581 0.147360 6.695019 -67,.. 1.615409
150 suppressed 0.687185 -18000.814671 0.999958 0.310344 3.222093 99,.. 1.064935
119 suppressed 0.687071 -15847.464550 1.018856 0.171281 5.948439 4099,.. 1.444477
104 suppressed 0.686985 -17326.854985 1.002006 0.357138 2.805658 456,.. 1.042707
159 suppressed 0.686388 -12400.171387 0.968001 0.177878 5.441929 -209,.. 1.478802
61 suppressed 0.686180 -18478.487653 1.000314 0.323603 3.091179 6,.. 0.965752
91 suppressed 0.685937 -16890.233842 0.990974 0.403501 2.455938 945,.. 0.877447
39 suppressed 0.685844 -12491.236604 0.992076 0.189296 5.240883 14,.. 1.431698
84 suppressed 0.685798 -16772.993369 1.001093 0.164108 6.100214 533,.. 1.397848
164 suppressed 0.685721 -15322.547333 1.000197 0.268926 3.719232 -64,.. 1.206657
191 suppressed 0.685667 -13083.237456 0.997780 0.051077 19.534712 9402,.. 1.822835
34 suppressed 0.684785 -19254.480430 1.010454 0.423364 2.386724 927,.. 0.801820
186 suppressed 0.684724 -14336.818556 0.995270 0.065934 15.094839 4920,.. 1.799114
125 suppressed 0.684648 -7343.474624 1.003233 0.003889 257.959484 13378,.. 2.425804
195 suppressed 0.684487 -15088.262603 1.010624 0.081418 12.412841 4535,.. 1.707507
44 suppressed 0.684426 -12602.203000 0.992964 0.118584 8.373496 2830,.. 1.587170
57 suppressed 0.684315 -15432.307927 0.987260 0.103845 9.507039 3405,.. 1.599209
35 suppressed 0.684293 -15977.929676 1.005304 0.146402 6.866719 3560,.. 1.468133
90 suppressed 0.683921 -10368.230592 1.006699 0.020568 48.945316 9070,.. 2.053029
8 suppressed 0.683656 -14324.525128 1.003344 0.067062 14.961539 3683,.. 1.755301
143 suppressed 0.683650 -13322.100404 1.004908 0.044540 22.561888 4260,.. 1.889706
149 suppressed 0.683370 -9704.420410 0.989848 0.040784 24.270205 3023,.. 1.967927
13 suppressed 0.683208 -12418.264857 1.002041 0.032487 30.844432 3250,.. 1.941987
81 suppressed 0.683157 -8065.420824 0.997320 0.026629 37.452357 5560,.. 2.111768
155 suppressed 0.683082 -5851.033242 1.003461 0.015278 65.681385 4198,.. 2.211873
162 suppressed 0.683080 -13541.997285 1.003286 0.067781 14.801812 2761,.. 1.737107
118 suppressed 0.682673 -12636.113605 0.996464 0.097281 10.243117 784,.. 1.652231
129 suppressed 0.682144 -15346.049849 1.001480 0.170969 5.857670 1160,.. 1.378440
10 suppressed 0.682071 -15331.747861 0.999931 0.149585 6.684724 823,.. 1.418027
94 suppressed 0.681737 -9516.486317 0.999176 0.029912 33.403877 2670,.. 2.135083
126 suppressed 0.681299 -16683.012731 1.014760 0.192281 5.277496 1351,.. 1.308661
47 suppressed 0.681072 -11282.463532 0.985347 0.102623 9.601607 1039,.. 1.626720
1 suppressed 0.680545 -14839.037213 1.003692 0.083693 11.992469 829,.. 1.717836
31 suppressed 0.680447 -14652.873022 1.015388 0.171427 5.923161 1006,.. 1.402128
110 suppressed 0.680445 -10737.317516 0.996909 0.038054 26.197516 1706,.. 1.984588
103 suppressed 0.680326 -9534.988898 1.027888 0.061813 16.629039 2996,.. 1.967403
198 suppressed 0.680190 -14300.365282 1.009282 0.350376 2.880571 -7534,.. 1.057658
64 suppressed 0.680186 -13690.852850 0.974986 0.254555 3.830155 1094,.. 1.124125
172 suppressed 0.679898 -12051.276254 0.978729 0.162753 6.013589 456,.. 1.591660
156 suppressed 0.679765 -9414.953722 1.018092 0.040578 25.089760 809,.. 2.035834
86 suppressed 0.678314 -18485.870253 1.008287 0.315098 3.199922 425,.. 0.955099
166 suppressed 0.674830 -10213.109846 1.006027 0.009063 111.007384 589,.. 2.314159
17 suppressed 0.674305 -7399.358588 0.994818 0.006210 160.208548 -26,.. 2.443177
33 suppressed 0.674213 -11999.325020 0.997773 0.026210 38.067764 947,.. 2.166395
153 suppressed 0.667498 -10779.112703 0.995397 0.125103 7.956619 3265,.. 1.909201
160 suppressed 0.666145 -12184.310797 0.953825 0.092485 10.313298 1992,.. 1.802685
154 suppressed 0.660428 -11935.061554 1.001120 0.144910 6.908575 4217,.. 1.810810
161 suppressed 0.658164 -9106.352917 1.010160 0.048330 20.901503 -8894,.. 2.054313
122 suppressed 0.658141 -12382.678106 1.008789 0.070623 14.284063 -6513,.. 1.926606
165 suppressed 0.657027 -10913.849168 1.014197 0.064141 15.811935 -7706,.. 1.997895
192 suppressed 0.641364 -3971.288308 0.993242 0.033069 30.035167 4877,.. 2.237789
189 suppressed 0.637593 -17525.524191 1.015197 0.222545 4.561765 1681,.. 1.244728
30 suppressed 0.614605 -1530.266321 0.993504 0.000391 2542.547937 1741,.. 2.684948
82 suppressed 0.541329 870.648161 1.005770 0.000019 53113.958037 5372,.. 3.934987
99 suppressed 0.507435 -6071.440214 1.026697 0.027348 37.541558 4211,.. 3.066611
141 suppressed 0.488867 -1285.003906 0.931659 0.009375 99.375559 845,.. 2.713847
55 suppressed 0.474265 -12607.693930 1.177187 0.005371 219.186429 8453,.. 1.726284
196 suppressed 0.299159 -8098.683107 0.946400 0.012435 76.108961 4247,.. 2.373405
In [23]:
df_repetitions[:-int(len(df_repetitions)/20)] #delete 5% worst
Out[23]:
model r G0 max binding min binding ratio energies information logo PCA1 PCA2
56 findmotif(both_strands=False, fit_G0=True, mot... 0.706996 -9177.028204 0.972033 0.133311 7.291495 [-7053.185480104829, 14371.734708221606, -8323... 2.362101 <PIL.Image.Image image mode=RGB size=1200x900 ... 19825.322932 -1553.438968
142 findmotif(both_strands=False, fit_G0=True, mot... 0.706961 -10863.349327 0.964528 0.124707 7.734337 [-5334.0554370904765, 9281.749185392397, -6488... 2.310951 <PIL.Image.Image image mode=RGB size=1200x900 ... 16781.517546 -769.408837
157 findmotif(both_strands=False, fit_G0=True, mot... 0.706862 -9727.766114 0.966156 0.135750 7.117146 [-6577.773398411619, 13912.727582882128, -7976... 2.317627 <PIL.Image.Image image mode=RGB size=1200x900 ... 19146.576793 -1523.318826
148 findmotif(both_strands=False, fit_G0=True, mot... 0.706826 -11675.698748 0.967140 0.134469 7.192312 [-4909.541315277625, 8409.74004047994, -6169.3... 2.269945 <PIL.Image.Image image mode=RGB size=1200x900 ... 15561.083980 -897.311099
163 findmotif(both_strands=False, fit_G0=True, mot... 0.706807 -11486.910299 0.965590 0.136809 7.057952 [-5785.075296653869, 11148.827507828752, -6951... 2.214978 <PIL.Image.Image image mode=RGB size=1200x900 ... 16311.527973 -1288.988453
... ... ... ... ... ... ... ... ... ... ... ...
110 findmotif(both_strands=False, fit_G0=True, mot... 0.680445 -10737.317516 0.996909 0.038054 26.197516 [1706.4767801559324, -4603.711766799798, 7958.... 1.984588 <PIL.Image.Image image mode=RGB size=1200x900 ... -17787.806490 -1832.951702
103 findmotif(both_strands=False, fit_G0=True, mot... 0.680326 -9534.988898 1.027888 0.061813 16.629039 [2996.130586825946, -2864.2646122122032, 2348.... 1.967403 <PIL.Image.Image image mode=RGB size=1200x900 ... -18716.013085 -6072.519659
198 findmotif(both_strands=False, fit_G0=True, mot... 0.680190 -14300.365282 1.009282 0.350376 2.880571 [-7534.602343903572, 17518.12083155398, -6352.... 1.057658 <PIL.Image.Image image mode=RGB size=1200x900 ... 14420.431194 -580.803961
64 findmotif(both_strands=False, fit_G0=True, mot... 0.680186 -13690.852850 0.974986 0.254555 3.830155 [1094.2766885463207, -722.0191909075406, 474.3... 1.124125 <PIL.Image.Image image mode=RGB size=1200x900 ... -2341.691988 19657.579257
172 findmotif(both_strands=False, fit_G0=True, mot... 0.679898 -12051.276254 0.978729 0.162753 6.013589 [456.14039991784085, 20.44544959578485, 0.7665... 1.591660 <PIL.Image.Image image mode=RGB size=1200x900 ... 3391.185915 -12443.880252

181 rows × 11 columns

In [11]:
df_repetitions=df_repetitions[:-int(len(df_repetitions)/20)] #delete 5% worst
In [12]:
### compare energy matrices of ensemble using PCA
print('I: Histogram of the regression coefficients r obtained by repeated optimizaion with the subset.')
df_repetitions['r'].plot(kind='hist')
plt.show()

pca = PCA(n_components=2)
pca_2c=pca.fit_transform(df_repetitions['energies'].tolist())    
df_repetitions[['PCA1', 'PCA2']]=pca_2c
print('I: 2-dimensional PCA explained %i %% of variance.' %(sum(pca.explained_variance_ratio_)*100))
if sum(pca.explained_variance_ratio_)<0.5:
      print('W: 2-dimensional PCA explained only %i %% of variance' %(sum(pca.explained_variance_ratio_)*100))

print('I: Visualization of the PCA with the regression quality by color.')        
df_repetitions.plot(x='PCA1', y='PCA2', kind='scatter', c='r',cmap=cm.coolwarm, edgecolors='black', linewidths=0.3)
I: Histogram of the regression coefficients r obtained by repeated optimizaion with the subset.
I: 2-dimensional PCA explained 79 % of variance.
I: Visualization of the PCA with the regression quality by color.
/home/GLipps/.local/lib/python3.8/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute `n_features_` was deprecated in version 1.2 and will be removed in 1.4. Use `n_features_in_` instead.
  warnings.warn(msg, category=FutureWarning)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe661562a30>
In [30]:
pca.explained_variance_ratio_
Out[30]:
array([0.62422327, 0.17305216])
In [29]:
fig=df_repetitions.plot(x='PCA1', y='PCA2', kind='scatter', c='r',cmap=cm.coolwarm, edgecolors='black', 
                        linewidths=0.3).get_figure()
fig.savefig('PCA_no_padding.svg')
In [13]:
mf.display_df(df_repetitions, nuc_type=NUC_TYPE)
model r G0 max binding min binding ratio energies information logo PCA1 PCA2
56 suppressed 0.706996 -9177.028204 0.972033 0.133311 7.291495 -7053,.. 2.362101 19825.322932 -1553.438968
142 suppressed 0.706961 -10863.349327 0.964528 0.124707 7.734337 -5334,.. 2.310951 16781.517546 -769.408837
157 suppressed 0.706862 -9727.766114 0.966156 0.135750 7.117146 -6577,.. 2.317627 19146.576793 -1523.318826
148 suppressed 0.706826 -11675.698748 0.967140 0.134469 7.192312 -4909,.. 2.269945 15561.083980 -897.311099
163 suppressed 0.706807 -11486.910299 0.965590 0.136809 7.057952 -5785,.. 2.214978 16311.527973 -1288.988453
131 suppressed 0.706774 -9728.752465 0.979635 0.121770 8.044950 -5290,.. 2.300738 18368.688916 -648.153351
106 suppressed 0.706773 -7915.539991 0.952564 0.124481 7.652290 -8470,.. 2.364914 21767.273901 -1973.738421
147 suppressed 0.706685 -10933.515501 0.965363 0.132006 7.313014 -4346,.. 2.340699 16241.576656 -51.849408
70 suppressed 0.706631 -12214.012249 0.963979 0.139102 6.930042 -5148,.. 2.223107 14771.727280 -1215.450068
89 suppressed 0.706580 -11248.116017 0.968648 0.136544 7.094042 -6221,.. 2.123406 17131.145911 -1241.442253
170 suppressed 0.706539 -12052.588484 0.963785 0.144152 6.685888 -4561,.. 2.256433 14838.411635 -994.248033
113 suppressed 0.706530 -11913.812017 0.969498 0.126538 7.661721 -4487,.. 2.247607 15168.678487 -310.031692
52 suppressed 0.706515 -12926.542010 0.965676 0.140475 6.874351 -4724,.. 2.109999 13999.504943 -816.220411
2 suppressed 0.706428 -12557.874320 0.958903 0.147678 6.493219 -4832,.. 2.246795 14139.117443 -1242.466310
193 suppressed 0.706419 -13273.206683 0.974611 0.142844 6.822906 -4308,.. 2.144290 13243.097834 -701.941397
67 suppressed 0.706398 -12923.995193 0.967148 0.137279 7.045115 -4610,.. 2.111763 14050.827382 -776.543545
115 suppressed 0.706326 -12983.544326 0.972911 0.146713 6.631377 -5042,.. 2.079642 14094.253400 -1114.332846
59 suppressed 0.706294 -12176.505872 0.965304 0.148764 6.488806 -5943,.. 2.064881 15677.738878 -1563.947147
0 suppressed 0.706252 -12343.309172 0.956334 0.143811 6.649934 -5709,.. 2.072961 15282.208651 -1500.888004
117 suppressed 0.706111 -12177.537911 0.970942 0.134118 7.239444 -3974,.. 2.095936 14931.934693 429.135675
18 suppressed 0.706074 -13749.828576 0.975109 0.156092 6.246997 -4223,.. 2.058359 12819.950581 -817.226814
127 suppressed 0.706036 -10854.233141 0.953125 0.141681 6.727261 -6960,.. 2.116820 17936.973959 -2214.055464
79 suppressed 0.705990 -12794.544096 0.974744 0.141340 6.896470 -4728,.. 2.046540 14346.213502 -439.569626
27 suppressed 0.705884 -10317.128912 0.967302 0.150407 6.431250 -4349,.. 2.238211 16939.020766 -681.852972
77 suppressed 0.705877 -14307.909056 0.969338 0.152928 6.338521 -4033,.. 1.948135 11968.350919 -626.428754
5 suppressed 0.705833 -13465.399731 0.968937 0.154969 6.252474 -4878,.. 1.966427 13548.939020 -933.322459
28 suppressed 0.705824 -13257.450651 0.970967 0.151509 6.408643 -5022,.. 2.031888 13742.292223 -1352.316119
12 suppressed 0.705811 -11814.471246 0.965943 0.154481 6.252820 -6645,.. 2.010246 16481.500242 -2044.963493
190 suppressed 0.705788 -14142.449874 0.973330 0.155344 6.265625 -4195,.. 2.043055 11921.744929 -959.975450
130 suppressed 0.705769 -13728.165226 0.967874 0.140905 6.868995 -4015,.. 2.022657 12664.142830 -352.722105
105 suppressed 0.705719 -14379.041850 0.978379 0.154051 6.350986 -4013,.. 1.993526 11651.704367 -926.449523
111 suppressed 0.705703 -12947.902083 0.970576 0.149884 6.475512 -4823,.. 1.953821 14488.182566 -550.496706
135 suppressed 0.705640 -14039.619117 0.973976 0.150254 6.482181 -3802,.. 2.004695 12390.326511 -304.126450
43 suppressed 0.705622 -14487.299010 0.967699 0.160324 6.035891 -3927,.. 1.978947 11565.797799 -943.929132
83 suppressed 0.705541 -14577.425748 0.981237 0.163550 5.999600 -4077,.. 1.946875 11633.274617 -989.040420
178 suppressed 0.705485 -14061.491830 0.968489 0.159125 6.086355 -3858,.. 1.959808 12302.571146 -253.736307
32 suppressed 0.705459 -14266.668919 0.967770 0.170622 5.672020 -4150,.. 2.023246 11857.043778 -1355.668409
26 suppressed 0.705413 -14701.042064 0.978571 0.169238 5.782209 -3986,.. 1.912349 11466.745134 -733.490110
54 suppressed 0.705339 -14959.846587 0.975378 0.162206 6.013202 -3678,.. 1.902153 10845.393256 -666.955906
45 suppressed 0.705324 -13361.090070 0.968532 0.159637 6.067098 -4836,.. 2.026302 13407.079649 -1470.679646
114 suppressed 0.705315 -12670.666987 0.976832 0.158935 6.146115 -6041,.. 1.986389 15070.371758 -2014.172460
24 suppressed 0.705314 -14054.154743 0.980018 0.155556 6.300092 -4528,.. 1.938419 12638.081816 -997.882414
14 suppressed 0.705282 -14088.876587 0.974698 0.152134 6.406837 -3837,.. 2.043591 11932.788908 -1056.594018
21 suppressed 0.705188 -8305.931026 0.962993 0.144902 6.645806 -8188,.. 2.340227 21291.674323 -1969.240440
48 suppressed 0.705135 -14723.661981 0.972560 0.172590 5.635099 -3644,.. 1.984678 11012.099537 -1135.085789
36 suppressed 0.705020 -14820.934033 0.977018 0.168904 5.784454 -3864,.. 1.823897 11427.236652 -409.257273
120 suppressed 0.704992 -14451.163255 0.978946 0.175247 5.586075 -4413,.. 1.908539 11918.222660 -1354.407693
140 suppressed 0.704955 -12370.971274 0.974654 0.156738 6.218381 -6621,.. 1.886788 15890.019253 -1998.100752
124 suppressed 0.704954 -11465.242231 0.967891 0.169187 5.720832 -7332,.. 1.946722 17430.891653 -2504.510853
139 suppressed 0.704909 -13699.534998 0.972513 0.166307 5.847711 -4736,.. 2.000360 12896.912172 -1539.780510
100 suppressed 0.704803 -15229.762285 0.991523 0.166159 5.967320 -3816,.. 1.799674 10755.995452 -589.810876
187 suppressed 0.704578 -14778.630303 0.961413 0.175678 5.472591 -4233,.. 1.857307 11279.798987 -1160.039032
181 suppressed 0.704422 -15758.529236 0.994799 0.192793 5.159941 -3386,.. 1.755134 10047.228292 -689.029558
3 suppressed 0.704033 -14230.229409 0.971167 0.173523 5.596750 -3634,.. 1.964398 11583.092292 -1240.207485
7 suppressed 0.704010 -13187.195178 0.972048 0.172629 5.630850 -5467,.. 2.011791 13803.072026 -2213.981689
25 suppressed 0.703962 -15155.136559 0.974104 0.186596 5.220385 -3974,.. 1.817961 10780.247655 -1283.696949
60 suppressed 0.703665 -15048.014390 1.002399 0.193495 5.180495 -4302,.. 1.708500 11684.422506 -767.178284
46 suppressed 0.703537 -14556.597842 0.979293 0.175544 5.578615 -4895,.. 1.762471 12243.393601 -1571.594710
16 suppressed 0.703486 -15225.570281 0.971553 0.173898 5.586916 -3398,.. 1.814056 10347.076762 -929.536865
123 suppressed 0.703194 -16545.540324 0.981143 0.209328 4.687109 -3038,.. 1.554067 8884.689560 -406.982919
72 suppressed 0.703165 -15850.112081 0.973095 0.203243 4.787836 -3353,.. 1.773192 9331.497704 -1317.958182
75 suppressed 0.698884 -18233.733094 0.996786 0.311778 3.197099 -2342,.. 1.082989 6792.692662 194.093183
38 suppressed 0.695414 -13623.049624 0.994722 0.202557 4.910824 -3831,.. 2.396506 8711.652512 -5313.586987
138 suppressed 0.694687 -15089.902655 1.014728 0.220736 4.597027 -3338,.. 2.119809 7852.274101 -4247.636299
23 suppressed 0.694529 -16078.835412 1.013021 0.181807 5.571966 3802,.. 1.433959 -10023.210293 -637.569871
152 suppressed 0.694514 -15316.049402 0.999945 0.148745 6.722527 4993,.. 1.550912 -11111.614743 -468.139196
11 suppressed 0.694501 -16062.485560 0.998670 0.169212 5.901884 3614,.. 1.456914 -9724.916594 -835.648705
112 suppressed 0.694494 -13742.600462 0.997189 0.111924 8.909516 6957,.. 1.689719 -13587.249131 -837.856520
175 suppressed 0.694472 -14164.875513 0.996417 0.138673 7.185389 5902,.. 1.575634 -13248.641898 -1130.557415
66 suppressed 0.694472 -15002.774532 1.008638 0.126429 7.977893 5820,.. 1.604546 -11564.776019 -506.606129
168 suppressed 0.694467 -13724.735721 1.004490 0.133346 7.532940 6287,.. 1.566653 -14218.684203 -1415.891101
184 suppressed 0.694458 -15387.275370 0.994654 0.188431 5.278623 2995,.. 1.401132 -11123.177265 -1788.048732
133 suppressed 0.694448 -12414.691250 0.994884 0.109906 9.052176 9312,.. 1.711392 -15688.551544 -1395.469676
93 suppressed 0.694435 -14169.410972 1.002387 0.124961 8.021589 9148,.. 1.639763 -12731.429311 16.455778
62 suppressed 0.694433 -12264.565203 1.005852 0.122552 8.207558 9222,.. 1.686218 -16241.115544 -1299.554446
183 suppressed 0.694432 -14371.123342 0.994981 0.193133 5.151792 3347,.. 1.392146 -13286.716322 -2216.436068
53 suppressed 0.694412 -15834.883136 1.005492 0.156837 6.411059 4776,.. 1.484806 -10460.244360 -203.317795
96 suppressed 0.694405 -15150.371077 1.000244 0.148052 6.756014 3849,.. 1.513579 -11384.948211 -1342.632396
68 suppressed 0.694395 -13458.785667 0.997705 0.120960 8.248227 8730,.. 1.687686 -14068.475141 -372.668187
176 suppressed 0.694343 -14975.228364 1.005385 0.155619 6.460571 3759,.. 1.504388 -11672.541902 -1730.282171
74 suppressed 0.694342 -15150.985358 1.003605 0.131100 7.655257 7033,.. 1.593152 -11181.438381 82.700783
128 suppressed 0.694338 -16536.467718 1.000250 0.227072 4.404989 2682,.. 1.298022 -9320.232085 -954.802551
185 suppressed 0.694315 -13738.270855 1.009902 0.164943 6.122722 4432,.. 1.465954 -14439.167424 -2029.086214
173 suppressed 0.694313 -15883.089098 1.010970 0.174569 5.791239 4027,.. 1.474913 -10251.825983 -556.507463
49 suppressed 0.694281 -17244.301344 0.992480 0.222056 4.469505 2967,.. 1.288843 -7734.327905 -274.185714
197 suppressed 0.694280 -13643.979833 1.000934 0.194265 5.152403 2254,.. 1.498778 -13408.452129 -4116.620574
87 suppressed 0.694267 -16629.203447 0.995603 0.191165 5.208070 2656,.. 1.417589 -8220.411749 -1115.364132
171 suppressed 0.694237 -14655.811006 1.003676 0.138184 7.263338 7563,.. 1.574202 -12387.416218 -72.885598
134 suppressed 0.694172 -8769.493202 1.000198 0.090513 11.050361 14864,.. 1.768039 -21542.687741 -2743.714968
6 suppressed 0.694171 -15555.987352 0.994548 0.158038 6.293098 5609,.. 1.512444 -10892.025742 25.127517
107 suppressed 0.693943 -14159.521215 1.008739 0.152237 6.626090 3627,.. 1.475668 -13374.189378 -2417.045582
98 suppressed 0.693925 -16305.551122 1.003250 0.175467 5.717609 5049,.. 1.439092 -9610.628337 425.002572
19 suppressed 0.693917 -13874.488687 1.017081 0.181240 5.611794 4695,.. 1.447960 -14482.376802 -1655.864059
71 suppressed 0.693914 -16364.352404 0.996756 0.171728 5.804261 5454,.. 1.415694 -9540.638665 400.032835
137 suppressed 0.693901 -15117.836331 0.989319 0.156571 6.318654 2208,.. 1.636275 -9712.776640 -3152.350007
92 suppressed 0.693895 -16730.019051 1.005339 0.200997 5.001760 3714,.. 1.329585 -9253.567844 53.840156
151 suppressed 0.693877 -16347.669237 1.001342 0.167465 5.979419 5545,.. 1.433163 -9411.976001 345.875164
177 suppressed 0.693819 -16232.167039 1.000425 0.232371 4.305291 1694,.. 1.396094 -8720.478226 -2582.689007
76 suppressed 0.693783 -14796.647482 0.990767 0.143743 6.892647 1966,.. 1.729334 -9296.541519 -4186.106620
169 suppressed 0.693747 -14755.260130 0.996178 0.135208 7.367745 3340,.. 1.640316 -11050.095772 -2165.412654
41 suppressed 0.693740 -17832.648444 1.000571 0.273132 3.663323 2284,.. 1.174987 -6705.237699 -690.279919
29 suppressed 0.693730 -16794.469280 1.006752 0.190572 5.282797 4328,.. 1.349449 -9075.294297 259.033757
174 suppressed 0.693650 -16425.707625 1.000743 0.190659 5.248862 2101,.. 1.450030 -8047.013677 -2143.823229
73 suppressed 0.693591 -15905.953102 0.997153 0.195337 5.104782 1864,.. 1.526860 -8521.780361 -2896.593651
22 suppressed 0.693492 -16180.359940 0.999066 0.145244 6.878535 4600,.. 1.491685 -9230.592814 -54.956971
37 suppressed 0.693463 -12675.489582 1.004201 0.131172 7.655599 1798,.. 1.885736 -10392.263597 -7123.273765
51 suppressed 0.693445 -15151.229348 0.994113 0.132859 7.482468 6056,.. 1.614267 -11064.439221 -189.300936
63 suppressed 0.693369 -15653.765872 0.999983 0.153297 6.523169 2486,.. 1.631502 -8682.278627 -2330.610230
146 suppressed 0.693074 -16487.993317 1.017384 0.179676 5.662324 6159,.. 1.396452 -9417.684588 834.733301
50 suppressed 0.693036 -14052.624639 0.970005 0.117852 8.230672 6736,.. 1.675559 -13023.202721 16.495359
20 suppressed 0.693028 -12364.355925 1.003365 0.115630 8.677385 3367,.. 1.765088 -14711.046014 -4489.799946
194 suppressed 0.692945 -15805.385445 0.989678 0.149223 6.632210 6822,.. 1.489779 -9956.787115 256.602057
80 suppressed 0.692882 -14273.457012 1.004135 0.139847 7.180234 10160,.. 1.578089 -12072.029588 1076.776803
116 suppressed 0.692686 -18097.210380 1.016817 0.285228 3.564926 2972,.. 1.087331 -7160.449564 593.454611
132 suppressed 0.692673 -18432.020836 0.989300 0.318265 3.108419 1567,.. 1.016520 -5534.968765 -807.381211
180 suppressed 0.692585 -16033.236868 1.001339 0.156398 6.402497 1619,.. 1.558806 -7120.521525 -3753.549888
15 suppressed 0.692562 -18077.831238 0.999079 0.290434 3.439956 1318,.. 1.106852 -5624.860606 -1572.510799
9 suppressed 0.692526 -14921.955589 1.000281 0.090554 11.046244 4794,.. 1.701320 -10596.164661 -1395.522557
167 suppressed 0.692142 -13294.262711 1.001934 0.249682 4.012837 927,.. 1.347249 3253.720938 18514.245775
65 suppressed 0.692015 -15158.321299 0.999915 0.268486 3.724273 2478,.. 1.166006 -12583.383401 -1746.586390
40 suppressed 0.691969 -14626.548881 0.998482 0.177429 5.627515 999,.. 1.559129 -8725.830443 -5958.660394
85 suppressed 0.691758 -12029.802566 0.987214 0.056552 17.456793 5942,.. 1.867132 -15302.839376 -3199.799481
145 suppressed 0.691695 -14977.745283 0.998701 0.092298 10.820350 4117,.. 1.700267 -10412.430478 -1534.004474
121 suppressed 0.691649 -16852.495659 1.002502 0.198143 5.059479 1279,.. 1.448922 -5766.831824 -3153.164930
109 suppressed 0.691624 11.105179 0.990409 0.092495 10.707753 -13237,.. 2.294190 32030.184953 742.500663
42 suppressed 0.691413 -14119.200441 0.999907 0.162818 6.141246 1923,.. 1.683997 -10660.683793 -4411.154839
102 suppressed 0.691146 -15496.540162 1.004959 0.155285 6.471688 6389,.. 1.556367 -9758.924938 283.301817
95 suppressed 0.690974 -13939.120994 0.999838 0.264063 3.786364 -6590,.. 1.228277 14836.846520 -360.605470
88 suppressed 0.690820 -14844.309111 1.014893 0.167929 6.043600 2945,.. 1.416806 -12693.119033 -1712.761478
188 suppressed 0.690502 -12473.763646 0.994681 0.161716 6.150772 480,.. 1.511505 3134.442980 19726.771878
4 suppressed 0.690244 -15635.011527 1.001321 0.339025 2.953533 808,.. 1.074127 997.594195 15398.035053
182 suppressed 0.689944 -14860.967741 1.017382 0.367334 2.769637 979,.. 1.026860 373.566810 17919.754793
136 suppressed 0.689725 -15719.484869 1.009689 0.213316 4.733294 1237,.. 1.362489 1993.645529 13282.476210
108 suppressed 0.689267 -16868.286217 1.010455 0.398269 2.537115 1003,.. 0.951770 500.213960 13192.986270
179 suppressed 0.688211 -14695.841635 1.002931 0.144639 6.934023 3931,.. 1.532106 -12959.688972 -216.249924
78 suppressed 0.688161 -14766.394484 0.995935 0.285833 3.484327 269,.. 1.178970 2963.916969 -9866.555736
58 suppressed 0.688110 -14520.763303 0.993134 0.221677 4.480086 -57,.. 1.317097 3121.352760 -10271.443008
158 suppressed 0.687926 -12973.184141 1.006087 0.053497 18.806378 6828,.. 2.049839 -11560.467523 -3524.108415
69 suppressed 0.687829 -15418.965515 0.993655 0.194727 5.102813 67,.. 1.446613 2510.251568 -9048.627678
97 suppressed 0.687648 -14316.538926 0.998669 0.093484 10.682791 334,.. 1.750742 5900.149692 11360.419684
101 suppressed 0.687579 -15265.916851 1.000543 0.210953 4.742970 54,.. 1.394806 2837.518386 -9249.189096
144 suppressed 0.687376 -17053.219866 0.996680 0.255810 3.896177 267,.. 1.238990 2153.952743 -6639.572600
199 suppressed 0.687220 -11473.991534 0.986581 0.147360 6.695019 -67,.. 1.615409 3719.498433 -14477.496567
150 suppressed 0.687185 -18000.814671 0.999958 0.310344 3.222093 99,.. 1.064935 1899.399844 -5308.274796
119 suppressed 0.687071 -15847.464550 1.018856 0.171281 5.948439 4099,.. 1.444477 -11383.878598 737.467934
104 suppressed 0.686985 -17326.854985 1.002006 0.357138 2.805658 456,.. 1.042707 2365.938768 -6175.180399
159 suppressed 0.686388 -12400.171387 0.968001 0.177878 5.441929 -209,.. 1.478802 4078.192048 -12958.317469
61 suppressed 0.686180 -18478.487653 1.000314 0.323603 3.091179 6,.. 0.965752 1754.121881 -4706.724191
91 suppressed 0.685937 -16890.233842 0.990974 0.403501 2.455938 945,.. 0.877447 -500.576252 13477.920046
39 suppressed 0.685844 -12491.236604 0.992076 0.189296 5.240883 14,.. 1.431698 4059.983855 -12945.296033
84 suppressed 0.685798 -16772.993369 1.001093 0.164108 6.100214 533,.. 1.397848 4062.851661 8565.301443
164 suppressed 0.685721 -15322.547333 1.000197 0.268926 3.719232 -64,.. 1.206657 2862.452431 -9067.590347
191 suppressed 0.685667 -13083.237456 0.997780 0.051077 19.534712 9402,.. 1.822835 -14663.426470 925.053849
34 suppressed 0.684785 -19254.480430 1.010454 0.423364 2.386724 927,.. 0.801820 2243.350482 6157.224735
186 suppressed 0.684724 -14336.818556 0.995270 0.065934 15.094839 4920,.. 1.799114 -12694.830772 949.860274
125 suppressed 0.684648 -7343.474624 1.003233 0.003889 257.959484 13378,.. 2.425804 -16410.723754 -8085.906606
195 suppressed 0.684487 -15088.262603 1.010624 0.081418 12.412841 4535,.. 1.707507 -11767.027918 1212.078376
44 suppressed 0.684426 -12602.203000 0.992964 0.118584 8.373496 2830,.. 1.587170 -16689.161603 -1470.724165
57 suppressed 0.684315 -15432.307927 0.987260 0.103845 9.507039 3405,.. 1.599209 -11344.106114 1090.086289
35 suppressed 0.684293 -15977.929676 1.005304 0.146402 6.866719 3560,.. 1.468133 -10904.424687 1216.334643
90 suppressed 0.683921 -10368.230592 1.006699 0.020568 48.945316 9070,.. 2.053029 -18137.990629 1116.859320
8 suppressed 0.683656 -14324.525128 1.003344 0.067062 14.961539 3683,.. 1.755301 -12820.389101 1245.937888
143 suppressed 0.683650 -13322.100404 1.004908 0.044540 22.561888 4260,.. 1.889706 -13977.907723 1415.837901
149 suppressed 0.683370 -9704.420410 0.989848 0.040784 24.270205 3023,.. 1.967927 -20216.342673 -1731.741537
13 suppressed 0.683208 -12418.264857 1.002041 0.032487 30.844432 3250,.. 1.941987 -14761.860714 1700.616363
81 suppressed 0.683157 -8065.420824 0.997320 0.026629 37.452357 5560,.. 2.111768 -22311.211588 -1943.523849
155 suppressed 0.683082 -5851.033242 1.003461 0.015278 65.681385 4198,.. 2.211873 -23059.925071 1637.390623
162 suppressed 0.683080 -13541.997285 1.003286 0.067781 14.801812 2761,.. 1.737107 -14044.565526 979.855649
118 suppressed 0.682673 -12636.113605 0.996464 0.097281 10.243117 784,.. 1.652231 -1953.723729 18329.698956
129 suppressed 0.682144 -15346.049849 1.001480 0.170969 5.857670 1160,.. 1.378440 -1449.402411 14096.422182
10 suppressed 0.682071 -15331.747861 0.999931 0.149585 6.684724 823,.. 1.418027 -1666.154083 13136.159489
94 suppressed 0.681737 -9516.486317 0.999176 0.029912 33.403877 2670,.. 2.135083 -18170.578388 779.589003
126 suppressed 0.681299 -16683.012731 1.014760 0.192281 5.277496 1351,.. 1.308661 -9672.752252 1127.767688
47 suppressed 0.681072 -11282.463532 0.985347 0.102623 9.601607 1039,.. 1.626720 -1918.956494 22000.990635
1 suppressed 0.680545 -14839.037213 1.003692 0.083693 11.992469 829,.. 1.717836 -676.043069 10562.278890
31 suppressed 0.680447 -14652.873022 1.015388 0.171427 5.923161 1006,.. 1.402128 -1204.504112 15714.993221
110 suppressed 0.680445 -10737.317516 0.996909 0.038054 26.197516 1706,.. 1.984588 -17787.806490 -1832.951702
103 suppressed 0.680326 -9534.988898 1.027888 0.061813 16.629039 2996,.. 1.967403 -18716.013085 -6072.519659
198 suppressed 0.680190 -14300.365282 1.009282 0.350376 2.880571 -7534,.. 1.057658 14420.431194 -580.803961
64 suppressed 0.680186 -13690.852850 0.974986 0.254555 3.830155 1094,.. 1.124125 -2341.691988 19657.579257
172 suppressed 0.679898 -12051.276254 0.978729 0.162753 6.013589 456,.. 1.591660 3391.185915 -12443.880252
156 suppressed 0.679765 -9414.953722 1.018092 0.040578 25.089760 809,.. 2.035834 -1694.158014 22864.795059
86 suppressed 0.678314 -18485.870253 1.008287 0.315098 3.199922 425,.. 0.955099 -1446.853313 7834.074132
166 suppressed 0.674830 -10213.109846 1.006027 0.009063 111.007384 589,.. 2.314159 1374.735976 15516.958973
17 suppressed 0.674305 -7399.358588 0.994818 0.006210 160.208548 -26,.. 2.443177 6976.009462 18384.433629
33 suppressed 0.674213 -11999.325020 0.997773 0.026210 38.067764 947,.. 2.166395 2392.176925 14258.187687
153 suppressed 0.667498 -10779.112703 0.995397 0.125103 7.956619 3265,.. 1.909201 1601.895608 -14270.021543
160 suppressed 0.666145 -12184.310797 0.953825 0.092485 10.313298 1992,.. 1.802685 1670.779356 -11301.186076
154 suppressed 0.660428 -11935.061554 1.001120 0.144910 6.908575 4217,.. 1.810810 1509.728158 -11271.466543
161 suppressed 0.658164 -9106.352917 1.010160 0.048330 20.901503 -8894,.. 2.054313 17810.362985 2576.397405
In [14]:
# visualisation of the motif with the highest r
print('I: Best motif according to r from the repeated optimizations.')
print('I: PCA components: %i, %i' %(df_repetitions.iloc[0]['PCA1'], df_repetitions.iloc[0]['PCA2']))
model_best_repetition=df_repetitions.iloc[0]['model']
model_best_repetition.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE) 
# store results and display stages
STAGES.append('best repetition', model_best_repetition)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
I: Best motif according to r from the repeated optimizations.
I: PCA components: 19825, -1553
I: energy matrix and logos:

       A      C     G     T
0 -7053  14371 -8323  1004
1  5640  -4676  5637 -6602
2   349  -2266  2198  -281
3  -790   1047   965 -1221

I: summed absolute energies of each position:
0    30752
1    22557
2     5096
3     4024
dtype: int64

I: averaged summed energy over all positions: 15607
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -9109 +/- 9135
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.13331 .. 0.97203 (ratio: 7.3)
I: number of probes: 3149
I: Pearson Correlation  r: 0.7070
I: mean absolute error: 0.1028
I: Classification performance AUROC: 0.8295
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.706996 0.829541 -9177.028204 True 7.291495 0.972033 0.133311 -7053,.. suppressed
In [15]:
# refit model to minimize mae
last_model=STAGES.df.at[max(STAGES.df.index),'model']
refitted_model_best_repetition=model_best_repetition.refit_mae(X,y)

# print & display main results
refitted_model_best_repetition.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

# store results and display stages
STAGES.append('mae optimzed', refitted_model_best_repetition)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
Optimization terminated successfully.
         Current function value: 0.098459
         Iterations: 8
         Function evaluations: 1734
I: energy matrix and logos:

       A      C     G     T
0 -6638  15358 -8211  -508
1  5720  -4656  5614 -6678
2    35  -2089  2203  -149
3  -482   1071   713 -1302

I: summed absolute energies of each position:
0    30716
1    22669
2     4478
3     3569
dtype: int64

I: averaged summed energy over all positions: 15358
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -9090 +/- 9549
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.13499 .. 0.97839 (ratio: 7.3)
I: number of probes: 3149
I: Pearson Correlation  r: 0.6931
I: mean absolute error: 0.0985
I: Classification performance AUROC: 0.8204
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.706996 0.829541 -9177.028204 True 7.291495 0.972033 0.133311 -7053,.. suppressed
1 mae optimzed T7 primase 3149 4 0.693057 0.820350 -9177.028204 True 7.291495 0.978392 0.134989 -6638,.. suppressed
In [16]:
### Based on the motif of CORE_MOTIF_LENGTH analyze the neigbouring positions 
### whether their inclusion can improve the quality of the motif
df_positions=model_best_repetition.investigate_extension_parallel(X, y, end5=3, end3=3, nuc_type=NUC_TYPE)

list_positions=df_positions.index[df_positions['+2%']].tolist()+[0] # list of positions with an increase of2% and default position 0
ext5=-min(list_positions)
ext3=max(list_positions)
print("I: It is suggested to extend the core motif at the 5' end by %i and at the 3' end by %i positions." %(ext5, ext3))
  0%|          | 0/6 [00:00<?, ?engine/s]
job5:   0%|          | 0/3 [00:00<?, ?tasks/s]
job3:   0%|          | 0/3 [00:00<?, ?tasks/s]
I: Optimization took 0.02 hours.
I: It is suggested to extend the core motif at the 5' end by 0 and at the 3' end by 0 positions.
In [17]:
### fit & predict optimization starting with extended energy matrix if extension appears to improve prediction

if ext5+ext3!=0: #extension suggestion from previous analysis of the bordering positions
    expanded_energies=model_best_repetition.energies_
    # append energies of single-optimized bordering positions to energies of central part
    if ext5!=0:
        energies_5=np.concatenate(df_positions['energies'][(df_positions.index<0) & (df_positions.index>=-ext5)].to_numpy())
        expanded_energies=np.concatenate((energies_5, expanded_energies))
    if ext3!=0:
        energies_3=np.concatenate(df_positions['energies'][(df_positions.index<=ext3) & (df_positions.index>0)].to_numpy().flatten())
        expanded_energies=np.concatenate((expanded_energies,  energies_3))

    mf.energies2logo(expanded_energies, nuc_type=NUC_TYPE)
    print('I: Optimization started with following extended motif.')
    expanded_motif_length=len(expanded_energies)//4
    
    
    model_extended=mf.findmotif(motif_length=expanded_motif_length, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS,
                   start=expanded_energies, fit_G0=True)

    start = time()
    model_extended.fit(X,y)
    print("Optimization took %.2f hours." % ((time() - start)/3600))

    # print & display main results
    model_extended.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

    # store results and display stages
    STAGES.append('extended', model_extended)
    mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
else:
    print('I: Motif is not extended based on previous analysis.')
I: Motif is not extended based on previous analysis.
In [18]:
### fit & predict optimization starting with extended energy matrix plus one bordering position on each side if current bordering position exceed the information of 0.25

last_model=STAGES.df.at[max(STAGES.df.index),'model']
I_5=mf.energies2information(last_model.energies_[0:4])>=0.25 #sufficient information content of 5' end position
I_3=mf.energies2information(last_model.energies_[-4:])>=0.25 #sufficient information content of 3' end position

if I_5 or I_3:
    print('I: At least one of the bordering positions of the current motif has an information content of at least 0.25. Extending.')
    expanded_energies_with_border=mf.modify_energies(last_model.energies_, end5=I_5, end3=I_3)  
    mf.energies2logo(expanded_energies_with_border, nuc_type=NUC_TYPE)
    motif_length_with_border=len(expanded_energies_with_border)//4

    model_with_border=mf.findmotif(motif_length=motif_length_with_border, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS,
                   start=expanded_energies_with_border, fit_G0=True)


    start = time()
    model_with_border.fit(X,y)
    print("Optimization took %.2f hours." % ((time() - start)/3600))

    # print & display main results
    model_with_border.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

    # store results and display stages
    STAGES.append('expanded, border', model_with_border)
    mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
else:
    print('I: Both bordering positions of the found motif have an information content below 0.25. No futher optimization required.')
I: At least one of the bordering positions of the current motif has an information content of at least 0.25. Extending.
Optimization took 0.23 hours.
I: energy matrix and logos:

       A     C     G     T
0  1225  -591  -397  -236
1 -7152  9225 -8511  6438
2  5548 -4640  5593 -6501
3   428 -2416  2263  -275
4  -937   900  1026  -989

I: summed absolute energies of each position:
0     2451
1    31326
2    22284
3     5383
4     3854
dtype: int64

I: averaged summed energy over all positions: 13060
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -8708 +/- 7835
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.07823 .. 0.97513 (ratio: 12.5)
I: number of probes: 3149
I: Pearson Correlation  r: 0.7229
I: mean absolute error: 0.1000
I: Classification performance AUROC: 0.8395
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.706996 0.829541 -9177.028204 True 7.291495 0.972033 0.133311 -7053,.. suppressed
1 mae optimzed T7 primase 3149 4 0.693057 0.820350 -9177.028204 True 7.291495 0.978392 0.134989 -6638,.. suppressed
2 expanded, border T7 primase 3149 5 0.722890 0.839472 -8721.403785 True 12.464345 0.975132 0.078234 1225,.. suppressed
In [19]:
last_model=STAGES.df.at[max(STAGES.df.index),'model']
df_relevant_positions=last_model.explore_positions(X, y)
list_positions=df_relevant_positions.index[df_relevant_positions['-2%']].tolist() # list of positions with an increase of2% and default position 0
start_relevant=min(list_positions)
end_relevant=max(list_positions)
red5=-start_relevant
red3=end_relevant-len(df_relevant_positions)+1
print('I: The analysis suggests, that positions between %i to %i contribute significantly to the motif' %(start_relevant, end_relevant))
last_model=STAGES.df.at[max(STAGES.df.index),'model']

if (end_relevant-start_relevant+1)in STAGES.df['motif length'].to_list():
    print('I: No need for a further optimization. An optimization with motif length of %i has already been done.' %(end_relevant-start_relevant+1))
    print('I: Checking whether G0 has been chosen correctly.')
    last_model.investigate_G0(X, y)
else:
    print('I: Bordering positions only marginally contributing towards regression quality are dropped.')
    print('I: New start energy for motif optimization:')
    start_final_model=mf.modify_energies(last_model.energies_, end5=red5, end3=red3)
    mf.energies2logo(start_final_model, nuc_type=NUC_TYPE)
    final_model=mf.findmotif(motif_length=len(start_final_model)//4, protein_conc=PROT_CONC, 
                             both_strands=BOTH_STRANDS, start=start_final_model, fit_G0=True)

    start = time()
    final_model.fit(X,y)
    print("Optimization took %.2f hours." % ((time() - start)/3600))

    # print & display main results
    final_model.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)
    
    print('I: Checking whether G0 has been chosen correctly.')
    final_model.investigate_G0(X, y)

    # store results and display stages
    STAGES.append('shrinked', final_model)
    mf.display_df(STAGES.df, nuc_type=NUC_TYPE)  
I: The analysis suggests, that positions between 0 to 4 contribute significantly to the motif
I: No need for a further optimization. An optimization with motif length of 5 has already been done.
I: Checking whether G0 has been chosen correctly.
I: Current G0 = -8721 J/mol (see red broken line in figure below) with r = 0.723.
I: Maximal r is 0.724 at G0=-6721 J/mol (see green broken line below).
I: Maximal occupancy of 2 is reached at G0=-10721 J/mol (see blue broken line below).
I: Maximal occupancy of 0.2 is reached at G0=-4721 J/mol (see blue broken line below).
I: G0 is in a range leading to maximal probe occupancy between 0.2 and 2. Good.
I: Maximal r is close to r achieved with current G0. Good.
In [20]:
# refit model to minimize mae
last_model=STAGES.df.at[max(STAGES.df.index),'model']
refitted_model=last_model.refit_mae(X,y)

# print & display main results
refitted_model.analyse_motif(X,y, THRESHOLD, nuc_type=NUC_TYPE)

# store results and display stages
STAGES.append('final mae optimzed', refitted_model)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
Optimization terminated successfully.
         Current function value: 0.096293
         Iterations: 11
         Function evaluations: 2870
I: energy matrix and logos:

       A      C     G     T
0  1245   -457  -285  -502
1 -6968  12092 -8413  3288
2  5629  -4538  5498 -6588
3  -240  -2118  2580  -221
4  -608    943   840 -1175

I: summed absolute energies of each position:
0     2491
1    30762
2    22254
3     5161
4     3567
dtype: int64

I: averaged summed energy over all positions: 12847
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -8674 +/- 8388
I: Plot of the Occupancy of a subsite as the function of the Free Energy G 
   overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
    lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe:
   binding:  0.07442 .. 0.88433 (ratio: 12.5)
I: number of probes: 3149
I: Pearson Correlation  r: 0.7096
I: mean absolute error: 0.0963
I: Classification performance AUROC: 0.8235
stage protein # probes motif length r AUROC G0 G0 fitted ratio max binding min binding energies model logo
0 best repetition T7 primase 3149 4 0.706996 0.829541 -9177.028204 True 7.291495 0.972033 0.133311 -7053,.. suppressed
1 mae optimzed T7 primase 3149 4 0.693057 0.820350 -9177.028204 True 7.291495 0.978392 0.134989 -6638,.. suppressed
2 expanded, border T7 primase 3149 5 0.722890 0.839472 -8721.403785 True 12.464345 0.975132 0.078234 1225,.. suppressed
3 final mae optimzed T7 primase 3149 5 0.709649 0.823475 -8721.403785 True 12.464345 0.884329 0.074416 1245,.. suppressed
In [21]:
STAGES.df['energies'].apply(mf.energies2information)
Out[21]:
0    2.362101
1    2.285369
2    2.500981
3    2.437509
Name: energies, dtype: float64
In [22]:
STAGES.df.to_json('%s_%s-%s-%s_%s-%s.json' %(PROTEIN_NAME, datetime.now().year, datetime.now().month,datetime.now().day , datetime.now().hour, datetime.now().minute))